In [9]:
# Aux Functions
def labels_dictionary():
    """Build a two-way lookup between column positions and column labels.

    Reads 'data/communities.names_edit.txt', one label per line.

    Returns:
        dict: maps every line index (int) to its label (str) and every
        label (str) back to its line index (int).
    """
    labels = dict()
    # The context manager closes the file even if reading fails.
    with open('data/communities.names_edit.txt', 'r') as file:
        for index, line in enumerate(file):
            # Strip only the line terminator; slicing off the last character
            # would truncate a final line that has no trailing newline.
            line = line.rstrip('\n')
            labels[index] = line  # Allow to read the label by the index
            labels[line] = index  # Allow to read the index by the label
    return labels
def get_labels(labels_dic, n_labels=128):
    """Return the first ``n_labels`` labels in positional order.

    Args:
        labels_dic: mapping that contains the integer keys
            ``0 .. n_labels - 1``, each mapped to a label.
        n_labels: how many labels to return. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        list: ``[labels_dic[0], ..., labels_dic[n_labels - 1]]``.

    Raises:
        KeyError: if one of the integer keys is missing from ``labels_dic``.
    """
    return [labels_dic[i] for i in range(n_labels)]
def columns_to_remove(communities, missing_data_percentage):
    """Find the columns whose share of missing values exceeds a threshold.

    Args:
        communities: pandas DataFrame to inspect.
        missing_data_percentage: threshold in percent; a column is reported
            only when its missing share is strictly greater than this value.

    Returns:
        list[int]: positional indices of the offending columns, ascending.
    """
    # Mean of the boolean null mask is the fraction of missing rows per column.
    incomplete_percent = communities.isnull().mean() * 100
    # Iterate positionally: the Series is indexed by column label, so looking
    # values up with an integer key would depend on pandas' deprecated
    # positional fallback for label-indexed Series.
    return [
        position
        for position, percent in enumerate(incomplete_percent)
        if percent > missing_data_percentage
    ]
# incomplete_values = communities.isnull().sum()
# incomplete_values_percent = (incomplete_values/communities.shape[0]*100)
# print("Percent of Incomple values")
# print(list(map(lambda x: x>75, incomplete_values_percent)))
In [10]:
import pandas
import numpy
labels_dic = labels_dictionary()
# Column names are supplied from the names file; passing names= makes pandas
# treat the first line of the data file as data rather than as a header.
communities = pandas.read_csv('data/communities.data.txt', sep=",", names=get_labels(labels_dic), encoding='utf-8')
# '?' cells are turned into NaN so pandas can count and drop them.
# numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
communities = communities.replace('?', numpy.nan)
col_to_remove = columns_to_remove(communities, 75) # Columns with more than 75% of missing data are removed
print("More than 75% incomplete:")
# Translate the positional indices back to column labels for display/drop.
list_of_incomplete = [labels_dic[position] for position in col_to_remove]
print(list_of_incomplete)
print("----\n")
# Features: the first 127 columns; the 128th column (index 127) is the target.
X = communities.iloc[:, 0:127] # OR .drop(labels='ViolentCrimesPerPop numeric', axis=1)
X = X.drop(labels=list_of_incomplete, axis=1)
X = X.drop(labels=['communityname string'], axis=1) #Temporary, change later
print("X new shape: ", X.shape, "\n")
# Kept as a one-column DataFrame (list indexer), so Y.values is 2-D.
Y = communities.iloc[:, [127]]
# df = pandas.DataFrame(X, columns=list(set(X['communityname string'])) )
# dummies = pandas.get_dummies(df)
# X.join(dummies)
# print(X.shape)
# X_val = X.values
# Y_val = Y.values
#print(X_val)
#print(Y_val)
In [11]:
X_rem = X
X_rem.dropna(inplace=True, axis='columns')
X_rem = X_rem.values
Y_rem = Y.values
#print(X_rem.shape)
from sklearn import linear_model
lm = linear_model.LinearRegression()
from sklearn.cross_validation import cross_val_predict
from sklearn import metrics
predictions = cross_val_predict(lm, X_rem, Y_rem, cv=6)
r2_score = metrics.r2_score(Y_rem, predictions)
print(r2_score)
mean_squared_error = metrics.mean_squared_error(Y_rem, predictions)
print(mean_squared_error)
In [12]:
from pandas import read_csv
from sklearn.preprocessing import Imputer
X_val = X.values
Y_val = Y.values
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imp = imputer.fit_transform(X_val)
from sklearn import linear_model
lm = linear_model.LinearRegression()
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
predictions = cross_val_predict(lm, X_val, Y_val, cv=6)
r2_score = metrics.r2_score(Y_val, predictions)
print(r2_score)
mean_squared_error = metrics.mean_squared_error(Y_val, predictions)
print(mean_squared_error)
In [13]:
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import linear_model
from sklearn.decomposition import KernelPCA
X_val = X.values
Y_val = Y.values
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imp = imputer.fit_transform(X_val)
#kpca = KernelPCA(n_components=50, kernel='linear')
kpca = KernelPCA(n_components=50, kernel='poly', degree=3)
X_KPCA = kpca.fit_transform(X_val)
lm = linear_model.LinearRegression()
predictions = cross_val_predict(lm, X_KPCA, Y_val, cv=6)
r2_score = metrics.r2_score(Y_val, predictions)
print(r2_score)
mean_squared_error = metrics.mean_squared_error(Y_val, predictions)
print(mean_squared_error)
In [ ]:
In [ ]: